Dataset

We will use Gapminder Population Data version 7. It can be downloaded from here. Download this MS Excel file and save it in the data directory of this repo as population_data.xlsx.

R Packages

Install the required packages.

Note: Run these commands manually in the R console

# Install every package this walkthrough attaches, in a single call.
# "scales" is listed explicitly because it is attached below with
# library(scales) to provide number_format() for the legend labels.
install.packages(c(
  "rnaturalearth",
  "stringr",
  "readxl",
  "dplyr",
  "scales",
  "ggplot2",
  "plotly"
))

Attach these packages

library(rnaturalearth)
library(stringr)
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Capturing and cleaning the data

Capture the World Map

# Country geometries from rnaturalearth, requested as an sf object.
world_map <- rnaturalearth::ne_countries(returnclass = "sf")

Read the downloaded population data (.xlsx file) and inspect its structure

Note: the required data is in the 4th sheet of the MS Excel file.

# Load the population table from the fourth sheet of the workbook,
# then list its column names.
world_pop <- readxl::read_xlsx(path = "data/population_data.xlsx", sheet = 4)
colnames(world_pop)
## [1] "geo"        "name"       "time"       "Population"

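Check for differences in country names between the population data and the world map data (e.g. “United States” vs “United States of America”). The call below lists the map names that have no exact match in the population data.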

# Country names used by the map that do not appear in the population data.
base::setdiff(x = world_map[["sovereignt"]], y = world_pop[["name"]])
##  [1] "Antarctica"                       "The Bahamas"                     
##  [3] "Ivory Coast"                      "Democratic Republic of the Congo"
##  [5] "Republic of Congo"                "Northern Cyprus"                 
##  [7] "Guinea Bissau"                    "Kyrgyzstan"                      
##  [9] "Kosovo"                           "Laos"                            
## [11] "Macedonia"                        "United States of America"        
## [13] "Western Sahara"                   "Somaliland"                      
## [15] "Republic of Serbia"               "Slovakia"                        
## [17] "East Timor"                       "United Republic of Tanzania"

Fix the anomalies: add a sovereignt column to the population data (matching the column name in the world map data) and replace the mismatched country names with the spellings the map uses

# Add a `sovereignt` column: a copy of `name` in which the population-data
# spellings (left) are swapped for the world-map spellings (right).
# Names not listed here are carried over unchanged.
world_pop <- world_pop %>%
  mutate(
    sovereignt = recode(
      name,
      "Tanzania" = "United Republic of Tanzania",
      "United States" = "United States of America",
      "Congo, Dem. Rep." = "Democratic Republic of the Congo",
      "Bahamas" = "The Bahamas",
      "Serbia" = "Republic of Serbia",
      "Macedonia, FYR" = "Macedonia",
      "Slovak Republic" = "Slovakia",
      "Congo, Rep." = "Republic of Congo",
      "Kyrgyz Republic" = "Kyrgyzstan",
      "Lao" = "Laos",
      "Cote d'Ivoire" = "Ivory Coast",
      "Timor-Leste" = "East Timor",
      "Guinea-Bissau" = "Guinea Bissau"
    )
  )

Preparing the data

Now we can join our two datasets, pick the columns we want for visualizing and remove missing values.

# Match population rows to map features on the shared `sovereignt` key,
# keep only the columns used for plotting, then drop rows containing NA.
world_data <- world_map %>%
  inner_join(world_pop, by = "sovereignt") %>%
  select(geo, sovereignt, Population, time, geometry) %>%
  na.omit()
head(world_data)
## Simple feature collection with 6 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 60.52843 ymin: 29.31857 xmax: 75.15803 ymax: 38.48628
## CRS:           +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
##   geo  sovereignt Population time                       geometry
## 1 afg Afghanistan    3280000 1800 MULTIPOLYGON (((61.21082 35...
## 2 afg Afghanistan    3280000 1801 MULTIPOLYGON (((61.21082 35...
## 3 afg Afghanistan    3280000 1802 MULTIPOLYGON (((61.21082 35...
## 4 afg Afghanistan    3280000 1803 MULTIPOLYGON (((61.21082 35...
## 5 afg Afghanistan    3280000 1804 MULTIPOLYGON (((61.21082 35...
## 6 afg Afghanistan    3280000 1805 MULTIPOLYGON (((61.21082 35...

We are only visualizing the data every twenty years, so we will remove the rest of the data too.

# Keep only the rows whose year falls on a 20-year step from 1800 to 2100.
world_data <- world_data %>%
  filter(is.element(time, seq(from = 1800, to = 2100, by = 20)))
head(world_data)
## Simple feature collection with 6 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 60.52843 ymin: 29.31857 xmax: 75.15803 ymax: 38.48628
## CRS:           +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
##   geo  sovereignt Population time                       geometry
## 1 afg Afghanistan    3280000 1800 MULTIPOLYGON (((61.21082 35...
## 2 afg Afghanistan    3288817 1820 MULTIPOLYGON (((61.21082 35...
## 3 afg Afghanistan    3586362 1840 MULTIPOLYGON (((61.21082 35...
## 4 afg Afghanistan    3922032 1860 MULTIPOLYGON (((61.21082 35...
## 5 afg Afghanistan    4288021 1880 MULTIPOLYGON (((61.21082 35...
## 6 afg Afghanistan    4707744 1900 MULTIPOLYGON (((61.21082 35...

Plotting

# Choropleth of population per country, built with ggplot2 and converted to
# an interactive plotly figure. `frame = time` is not a ggplot2 aesthetic;
# it is passed through for ggplotly() to use as the animation frame.
ggplotly(
  ggplot(
    world_data,
    aes(geometry = geometry, frame = time)
  ) +
    geom_sf(aes(fill = Population)) +
    theme_void() +
    ggtitle("World Population Prediction 1800 - 2100") +
    scale_fill_distiller(
      palette = "RdBu",
      # Full argument name: `label` only worked via partial argument matching.
      # Legend values are shown in millions, e.g. 3280000 -> "3M".
      labels = number_format(scale = 1e-6, suffix = "M"),
      # Colour scale is pinned to the range 0 to 2 billion.
      limits = c(0, 2000000000)
    )
)